################################################################################
# I'm sorry to say the code you are reading dares to clean Mercer County NJ's
#   audit data
#
# written by sbaltz, based on a great deal of painful work by fg100, at mit,
# and shamelessly relying on Shing Chan's numpy RLE algorithm,
#   various sordid moments in 2022
################################################################################
import pandas as pd
import numpy as np
import copy


################################################################################
# Globular clusters
################################################################################
# When True, QuietPrint emits its messages; when False it stays silent.
VERBOSE = False

# Input files, read from ../transcribed/ in this order.
FNAMES = [
    'mercer4_mturk.csv',
    'mercer12_mturk.csv',
]

# Per file: the value of the `office` column that marks the start of a new
# segment (note the capitalisation differs between the two files).
SWITCH_IDS = {
    'mercer4_mturk.csv': 'president',
    'mercer12_mturk.csv': 'President',
}

# Per file: the value written to the `district` column of every row.
DISTS = {
    'mercer4_mturk.csv': 4,
    'mercer12_mturk.csv': 12,
}

#The basic observation is that the switch from one subdivision to the next is
# signaled by stating presidential vote counts. We will use RLE to find those
# switch points. Here I import an RLE algorithm written by Shing Chan, from
# https://www.kaggle.com/code/shingchan/fast-rle-with-pure-numpy
def rle(mask):
    """Run-length encode the truthy runs of a binary mask.

    Parameters
    ----------
    mask : array-like
        Binary 0/1 or True/False values. Any array-like is accepted (numpy
        array, list, tuple); it is coerced to a boolean array. Input with
        more than one dimension is flattened in column-major order.

    Returns
    -------
    list
        Flat list ``[start_1, length_1, start_2, length_2, ...]`` with one
        pair per run of truthy values. Starts are 1-indexed. An empty list
        is returned when the mask contains no truthy value.
    """
    # Coerce up front so plain sequences (which have no .reshape) and
    # non-boolean dtypes (which may not support ^) behave like bool arrays.
    mask = np.asarray(mask, dtype=bool)
    if not mask.any():
        return []

    mask = mask.reshape(-1, order='F')
    # Pad with False on both sides so runs touching either end still produce
    # both a rising and a falling edge.
    padded = np.concatenate(([False], mask, [False]))
    edges = padded[1:] ^ padded[:-1]
    idxs = np.flatnonzero(edges) + 1  # alternating start/end, 1-indexed
    idxs[1::2] -= idxs[0::2]  # replace each 'end' by the run length
    return list(idxs)

def QuietPrint(theString):
    """Print theString only when the module-level VERBOSE flag is truthy."""
    if not VERBOSE:
        return
    print(theString)


################################################################################
# Chop chop
################################################################################
# Segments are collected in lists and concatenated once after the loop:
# calling pd.concat per segment re-copies everything accumulated so far, and
# seeding the accumulation with empty frames is a pattern pandas has
# deprecated for dtype resolution.
original_parts, audited_parts = [], []
for fname in FNAMES:
    nj = pd.read_csv(f'../transcribed/{fname}',
                     encoding='utf-8',
                     low_memory=False)
    nj['district'] = DISTS[fname]

    #rle returns [start, length, start, length, ...] with 1-indexed starts;
    # keep only the starts and shift them to 0-indexed row positions
    all_rle = rle(np.array(nj.office == SWITCH_IDS[fname]))
    start_inds = [start - 1 for start in all_rle[0::2]]

    #Sentinel so that the last segment runs through the final row. Rows
    # before the first start index belong to no segment and are not kept.
    start_inds.append(len(nj))

    #Segments alternate: the 1st, 3rd, ... (even i) go to the original
    # dataframe while the 2nd, 4th, ... (odd i) go to the audited dataframe
    for i, (stInd, next_start) in enumerate(zip(start_inds, start_inds[1:])):
        curr_df = nj.iloc[stInd:next_start]
        if i % 2:
            QuietPrint(f"audited \t {stInd}")
            audited_parts.append(curr_df)
        else:
            QuietPrint(f"original \t {stInd}")
            original_parts.append(curr_df)

#pd.concat raises on an empty list, so fall back to an empty frame
audited = pd.concat(audited_parts) if audited_parts else pd.DataFrame()
original = pd.concat(original_parts) if original_parts else pd.DataFrame()

#audited.to_csv('testa.csv', index=False)
#original.to_csv('testo.csv', index=False)

#Renumber both frames from zero so their rows can be paired by position; the
# previous row labels are retained in a new 'index' column
audited = audited.reset_index()
original = original.reset_index()

n_audited, n_original = len(audited), len(original)
if n_audited != n_original:
    # NOTE(review): this mismatch is only reported when VERBOSE is set, and
    # the longer frame is then truncated below -- confirm that is intended
    QuietPrint(f"The audited dataset has {n_audited} rows, while the" +\
               f" original dataset has {n_original} rows")

#Keep only the rows both frames have (.loc slices include the end label)
last_shared_label = min(n_audited, n_original) - 1
audited = audited.loc[:last_shared_label]
original = original.loc[:last_shared_label]

#Start from an independent copy of the original rows and attach every audited
# column next to it under an '_audited' suffix. The raw values are used so
# that rows are paired by position rather than by index label.
res = original.copy(deep=True)
for col_name in audited.columns:
    res[col_name + '_audited'] = audited[col_name].values

#Add standard variables
renames = {'votes': 'original', 'votes_audited': 'audited'}
res = res.rename(columns=renames)
res['difference'] = res['audited'] - res['original']
res['state'] = 'NEW JERSEY'
res['county'] = 'MERCER'
res['method'] = 'MANUAL'

res = res.drop(columns=['candidate_audited', 'office_audited'])

#Keep only the standard columns, in the standard order, and write them out
output_columns = ['state', 'county', 'district', 'office', 'candidate',
                  'original', 'audited', 'difference', 'method']
res = res[output_columns]
res.to_csv('../ready/mercer_cleaned.csv', index=False)

